In this notebook we experiment with the number of cold (burn-in) iterations for the Metropolis sampler used to break a substitution cipher.
In [35]:
import numpy as np
import math
import matplotlib.pyplot as plt
%matplotlib inline
import random
from numpy.random import rand
from copy import copy
from __future__ import division
import time
read text
In [36]:
def read_text_words(filename, wordsnumber):
    """Read the first `wordsnumber` *lines* of a file and join them.

    NOTE(review): despite the name, the limit is applied to lines
    (via readlines), not words -- confirm callers expect this.
    All newline characters are stripped from the result.
    """
    with open(filename) as f:
        head = f.readlines()[:wordsnumber]
    return ''.join(head).replace('\n', '')
def read_text_whole(filename):
    """Read an entire file into one string with all newlines removed."""
    with open(filename) as f:
        return f.read().replace('\n', '')
def chop_text_to_size(text, size):
    """Truncate text to at most `size` megabytes (size * 1024 * 1024 chars)."""
    limit = 1024 * 1024 * size
    return text[:limit]
def read_text_filesize(filename, size):
    """Read up to `size` megabytes from a file, with newlines removed."""
    with open(filename) as f:
        chunk = f.read(1024 * 1024 * size)
    return chunk.replace('\n', '')
counts
In [37]:
def get_unicount(text):
    """Count occurrences of each lowercase letter 'a'-'z' in `text`.

    Parameters
    ----------
    text : str
        Input text.  Characters outside 'a'-'z' are ignored.  (The old
        version indexed `counts[ord(c) - 97]` unconditionally: characters
        with ord 71..96 ('G'..'`') produced valid *negative* indices and
        silently corrupted the counts, while others raised IndexError.)

    Returns
    -------
    numpy.ndarray
        Length-26 array; counts[i] is the number of occurrences of chr(97 + i).
    """
    counts = np.zeros(26)
    for ch in text:
        idx = ord(ch) - 97  # 97..122 are 'a'..'z'
        if 0 <= idx < 26:
            counts[idx] += 1
    return counts
bigram statistics
In [ ]:
def get_bigram_stats_dic(text, counts=None):
    """Estimate conditional bigram frequencies count(ab)/count(a) over 'a'-'z'.

    Parameters
    ----------
    text : str
        Training text (expected to consist of lowercase letters).
    counts : sequence of 26 numbers, optional
        Unigram letter counts to normalize by.  If None they are computed
        from `text` itself.  (The old version silently read a module-level
        `counts` global, which made the function fail or give wrong results
        when called before that global existed.)

    Returns
    -------
    dict
        Maps every pair of letters (a, b) to count(ab) / count(a); pairs
        whose first letter never occurs map to 0 (the old code divided by
        zero there).
    """
    alphabet = 'abcdefghijklmnopqrstuvwxyz'
    if counts is None:
        counts = [0] * 26
        for ch in text:
            i = ord(ch) - 97
            if 0 <= i < 26:
                counts[i] += 1
    dic = {}
    for a in alphabet:
        for b in alphabet:
            dic[(a, b)] = 0
    # zip(text, text[1:]) walks consecutive character pairs
    for a, b in zip(text, text[1:]):
        if (a, b) in dic:
            dic[(a, b)] += 1
    for k, v in dic.items():
        c = counts[ord(k[0]) - 97]
        dic[k] = v / c if c > 0 else 0  # guard against absent first letters
    return dic
quality
In [68]:
def quality(decrypted, original):
    """Fraction of positions where `decrypted` matches `original`.

    The comparison runs over min(len(decrypted), len(original)) positions
    (zip truncates) but is normalized by len(decrypted).
    """
    matches = sum(1.0 for d, o in zip(decrypted, original) if d == o)
    return matches / len(decrypted)
crypt
In [39]:
def crypt(text):
    """Encrypt `text` with a random substitution cipher over 'a'-'z'.

    Characters outside 'a'-'z' are dropped from the output.  (The old
    bare ``except: pass`` version did NOT drop everything: characters
    'G'..'`' (ord 71..96) gave small *negative* offsets, which are valid
    list indices in Python, so they were silently mis-encrypted instead
    of being skipped.)

    Returns
    -------
    (str, list)
        The encrypted text and the permutation p used, where plaintext
        letter i is replaced by letter p[i].
    """
    p = list(range(26))
    random.shuffle(p)
    output = ''
    for ch in text:
        x = ord(ch) - ord('a')
        if 0 <= x < 26:  # keep only lowercase letters
            output += chr(p[x] + ord('a'))
    return output, p
unnormalized desiredPDF (likelihood)
In [44]:
def get_desiredPDF_bigram(permutation, text=None, bigram_stats=None):
    """Log-likelihood (unnormalized) of a decrypting permutation.

    Parameters
    ----------
    permutation : sequence of int
        Candidate decryption: ciphertext letter i maps to permutation[i].
    text : str, optional
        Ciphertext to score; defaults to the module-level `encrypted`
        (the old version read that global implicitly).
    bigram_stats : dict, optional
        Bigram frequency table; defaults to the module-level `stats`.

    Returns
    -------
    float
        Sum of log bigram probabilities of the decrypted text, with a
        fixed -9 penalty for bigrams never seen in training.
    """
    if text is None:
        text = encrypted  # module-level ciphertext
    if bigram_stats is None:
        bigram_stats = stats  # module-level bigram table
    logp = 0
    for a, b in zip(text, text[1:]):
        pr = bigram_stats[chr(permutation[ord(a) - 97] + 97),
                          chr(permutation[ord(b) - 97] + 97)]
        if pr > 0:
            logp += math.log(pr)
        else:
            logp += -9  # penalty for non-existent pairs
    return logp
metropolis
In [41]:
def metropolis(desiredPDF, initValue, computableRVS, skipIterations=1000):
    """Metropolis sampler working in log-space; yields states after burn-in.

    Parameters
    ----------
    desiredPDF : callable
        Returns the *log* of the unnormalized target density of a state.
    initValue
        Starting state of the chain.
    computableRVS : callable
        Proposal: maps the current state to a candidate state.
    skipIterations : int, optional
        Number of burn-in ("cold") iterations before samples are yielded.
        (The old default was ``skipIterations = iterations``, which
        evaluated a module-level name at definition time -- a NameError,
        or later a *list*, which then broke the burn-in loop.)

    Yields
    ------
    (state, log_density)
        The chain state after each post-burn-in step (accepted or not).
    """
    current = initValue
    currentLogDensity = desiredPDF(current)
    # burn-in: let the chain forget its starting point
    for _ in range(skipIterations):
        candidate = computableRVS(current)
        candidateLogDensity = desiredPDF(candidate)
        # densities are logs, so the acceptance ratio becomes a difference
        acceptanceProb = min(0, candidateLogDensity - currentLogDensity)
        if math.log(random.random()) < acceptanceProb:
            current = candidate
            currentLogDensity = candidateLogDensity
    print("-----")  # marks the end of burn-in
    # chain is assumed converged: stream out states forever
    while True:
        candidate = computableRVS(current)
        candidateLogDensity = desiredPDF(candidate)
        acceptanceProb = min(0, candidateLogDensity - currentLogDensity)
        if math.log(random.random()) < acceptanceProb:
            current = candidate
            currentLogDensity = candidateLogDensity
        yield current, currentLogDensity
permutation generator and computablervs
In [43]:
def uniform( n ):
    """Draw a uniformly random permutation of 0..n-1 (Fisher-Yates shuffle)."""
    perm = list(range(n))
    # swap position i with a uniformly chosen position in [i, n-1] inclusive
    for i in range(n):
        j = random.randint(i, n - 1)
        perm[i], perm[j] = perm[j], perm[i]
    return perm
def applyTransposition( basePermutation ):
    """Return a copy of basePermutation with one random transposition applied.

    The two swap positions are drawn independently, so they may coincide,
    in which case the permutation comes back unchanged (identity swap).
    The input list is not modified.
    """
    size = len( basePermutation )
    result = copy( basePermutation )
    first, second = random.randint( 0, size - 1 ), random.randint( 0, size - 1 )
    result[ first ], result[ second ] = result[ second ], result[ first ]
    return result
In [ ]:
decrypt
In [45]:
def decrypt(permutation, encrypted):
    """Apply a letter permutation to ciphertext: letter i -> permutation[i]."""
    return ''.join(chr(permutation[ord(ch) - 97] + 97) for ch in encrypted)
In [46]:
#TEST TEXT: the text we will encrypt and then try to recover
fname = 'main/oliver_twist.txt'
original = read_text_words(fname, 5000)[3:]  # [3:] skips a leading artifact (BOM/header?) -- TODO confirm
encrypted, p = crypt(original)
#TRAIN TEXT: source of the unigram/bigram statistics
train_text = read_text_whole('main/war_and_peace.txt')
counts = get_unicount(train_text)
stats = get_bigram_stats_dic(train_text)
print 'encrypting permutation: ', p
# Build bp = inverse of p.  Since p is a permutation of 0..25, iterating
# over its *values* still visits every number 0..25 exactly once, so
# bp[p[i]] = i correctly inverts it.
bp = np.zeros(26, dtype=int)
for i in p:
    bp[p[i]] = i
# Likelihood of the true inverse permutation (should be high) ...
q = get_desiredPDF_bigram(bp)
print 'inverse to encrypting permutation: ', bp
print 'its likelihood: ', q
# ... versus a random permutation (should be much lower).
ra = uniform(26)
q = get_desiredPDF_bigram(ra)
print 'likelihood of random permutation: ', q
In [82]:
# Experiment 1: vary the number of burn-in ("cold") iterations while the
# number of collected samples stays fixed at 500.  Repeated for 4 random
# restarts (k); qs stores one quality value per (restart, setting) pair.
import time
iterations = [250,500,1000,2000]
qs = np.zeros(16)
for k in xrange(4):
    j=0
    init_p = uniform(26)  # fresh random starting permutation per restart
    for it in iterations:
        st = time.time()
        computableGen = lambda t: applyTransposition(t)
        metropolisgenerator = \
            metropolis(get_desiredPDF_bigram, init_p, computableGen, it)
        x = []
        y = []
        # collect 500 samples after `it` burn-in iterations
        for i in xrange( 500 ):
            a,b = metropolisgenerator.next()
            x.append(a)
            y.append(b)
        et = time.time() - st
        print 'cold iterations: ', it
        print 'metropolis time: ', et
        # pick the sampled permutation with the highest log-density
        best = np.argmax(y)
        bestx = x[best]
        print 'best density among ', 500, ' last iterations: ', y[best]
        print 'corresponding permutation: ', bestx
        decrypted = decrypt(bestx, encrypted)
        # qs layout: 4 consecutive entries per restart k (index 4*k+j)
        qs[4*k+j] = quality(decrypted, original)
        print 'quality: ', qs[4*k+j]
        j+=1
# one quality-vs-burn-in curve per restart
plt.plot(iterations, qs[:4], iterations, qs[4:8], iterations, qs[8:12], iterations, qs[12:16])
Out[82]:
In [ ]:
# NOTE(review): calling savefig in a separate cell after the figure was
# displayed usually saves an *empty* figure; it should run in the same
# cell as the plt.plot above -- confirm the saved image is not blank.
plt.savefig('Bigram,noWordDelimiter,metrop,varyColdIters')
In [83]:
iterations = [250,500,1000,2000]
qs = np.zeros(16)
for k in xrange(4):
j=0
init_p = uniform(26)
for it in iterations:
st = time.time()
computableGen = lambda t: applyedTranspostions(t)
metropolisgenerator = \
metropolis(get_desiredPDF_bigram, init_p, computableGen, 0)
x = []
y = []
for i in xrange( it ):
a,b = metropolisgenerator.next()
x.append(a)
y.append(b)
et = time.time() - st
print 'cold iterations: ',
print 'metropolis time: ', et
best = np.argmax(y)
bestx = x[best]
print 'best density among ', it, ' last iterations: ', y[best]
print 'corresponding permutation: ', bestx
decrypted = decrypt(bestx, encrypted)
qs[4*k+j] = quality(decrypted, original)
print 'quality: ', qs[4*k+j]
j+=1
plt.plot(iterations, qs[:4], iterations, qs[4:8], iterations, qs[8:12], iterations, qs[12:16])
Out[83]:
In [84]:
# NOTE(review): as with the previous figure, savefig in a cell of its own
# likely writes an empty image; move it next to the corresponding plt.plot.
plt.savefig('Bigram,noWordDelimiter,metrop,varyIters')
In [2]:
import numpy as np
import matplotlib.pyplot as plt
% matplotlib inline
iterations = [250, 500, 1000, 2000]
stat_raw = np.array([
12.8880000114, 0.123917748918,
26.8050000668, 0.492153679654,
51.4409999847, 0.577245670996,
106.414999962, 1.0,
13.2339999676, 0.106466450216,
26.1779999733, 0.0572691197691,
51.9179999828, 0.686327561328,
108.29700017, 1.0,
12.7339999676, 0.136228354978,
25.6190001965, 0.762490981241,
51.6050000191, 0.910984848485,
104.585999966, 1.0,
12.986000061, 0.000631313131313,
25.6989998817, 0.703373015873,
53.864000082, 0.135326479076,
108.180000067, 1.0]).reshape((16, 2))
stat = np.array([stat_raw[::4, 1], stat_raw[1::4, 1], stat_raw[2::4, 1], stat_raw[3::4, 1]])
means = np.mean(stat, 1)
stds = np.std(stat, 1)
print(means)
print(stds)
plt.title('Dependence quality on cold iterations')
plt.xlabel('iterations')
plt.ylabel('quolity')
plt.plot(iterations, means - stds, 'r:')
plt.plot(iterations, means + stds, 'r:')
plt.plot(iterations, means, 'b-')
plt.savefig('task-2-daniel.png')
In [ ]: